library(readr)
# Import the white-wine data set from CSV: comma-separated with a header row,
# "NA" marks missing values, "." is the decimal mark, and surrounding
# whitespace is stripped from fields.
# NOTE(review): the path below is specific to one machine; adjust before
# running elsewhere.
White_wines <- read.table(
  "~/Desktop/Big Data/Regression-1/White_wines.csv",
  header = TRUE, sep = ",", na.strings = "NA", dec = ".", strip.white = TRUE
)
# View() opens an interactive data viewer, so it is only called when R is
# running interactively; scripted or knitted runs skip it.
if (interactive()) {
  View(White_wines)
}
## Warning: running command ''/usr/bin/otool' -L '/Library/Frameworks/
## R.framework/Resources/modules/R_de.so'' had status 1
# Import Data
#View(White_wines)

Summary of Data

Look at a summary of the data.

# Per-column summary (min, quartiles, median, mean, max) of the full data set.
summary(White_wines)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0       
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0       
##  Median :0.04300   Median : 34.00      Median :134.0       
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4       
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0       
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.878  
##  3rd Qu.:6.000  
##  Max.   :9.000

This dataset is composed of 12 variables. The dependent variable of interest is quality. We will investigate the relationship between the remaining 11 variables (fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, and alcohol) and quality.

Univariate Analysis

Quality appears to be approximately normally distributed, with scores ranging from a minimum of 3 to a maximum of 9, a mean score of 5.88, and a median of 6.0. A boxplot of quality shows potential outliers. These should be considered when interpreting the remainder of the analysis.

# Frequency histogram of the quality scores, with breaks="Sturges" and dark-gray bars.
with(White_wines, Hist(quality, scale="frequency", breaks="Sturges", 
  col="darkgray"))

# Boxplot of the quality scores. The values printed below are presumably the
# row labels of the points flagged as outliers — confirm against car::Boxplot.
Boxplot( ~ quality, data=White_wines, id.method="y")

##  [1] "252"  "254"  "295"  "446"  "741"  "874"  "1035" "1230" "1418" "1485"
## [11] "775"  "821"  "828"  "877"  "1606" "18"   "21"   "23"   "69"   "75"

The distribution of the remaining variables can be seen in the histograms below. Residual sugar, alcohol, and volatile acidity appear to have right-skewed distributions; while not perfect, the other variables appear to be approximately normally distributed.

# Draw one frequency histogram per remaining variable, in the same order and
# with the same settings for every plot (breaks="Sturges", dark-gray bars).
# The column name is passed as the x-axis label so each plot stays labelled
# with its variable.
invisible(lapply(
  c("alcohol", "chlorides", "citric.acid", "density", "fixed.acidity",
    "free.sulfur.dioxide", "pH", "residual.sugar", "sulphates",
    "total.sulfur.dioxide", "volatile.acidity"),
  function(column) {
    Hist(White_wines[[column]], scale = "frequency", breaks = "Sturges",
      col = "darkgray", xlab = column)
  }
))

Multivariate Analysis

To begin investigating potential relationships, scatterplot matrices have been produced below.

Scatterplot Matrix: quality, alcohol, chlorides, citric acid.

# Scatterplot matrix of alcohol, chlorides, citric.acid and quality.
# Regression lines, smoothers and ellipses are switched off, no points are
# labelled (id.n=0), and the diagonal panels are set to 'density'.
# NOTE(review): the argument names look like the older car interface —
# confirm they match the installed car version.
scatterplotMatrix(~alcohol+chlorides+citric.acid+quality, reg.line=FALSE, 
  smooth=FALSE, spread=FALSE, span=0.5, ellipse=FALSE, levels=c(.5, .9), 
  id.n=0, diagonal = 'density', data=White_wines)

Scatterplot Matrix: quality, density, fixed acidity, free sulfur dioxide.

# Scatterplot matrix of density, fixed.acidity, free.sulfur.dioxide and
# quality. Regression lines, smoothers and ellipses are switched off, no
# points are labelled (id.n=0), and the diagonal panels are set to 'density'.
scatterplotMatrix(~density+fixed.acidity+free.sulfur.dioxide+quality, 
  reg.line=FALSE, smooth=FALSE, spread=FALSE, span=0.5, ellipse=FALSE, 
  levels=c(.5, .9), id.n=0, diagonal = 'density', data=White_wines)

Scatterplot Matrix: quality, pH, residual sugar, sulphates.

# Scatterplot matrix of pH, quality, residual.sugar and sulphates.
# Regression lines, smoothers and ellipses are switched off, no points are
# labelled (id.n=0), and the diagonal panels are set to 'density'.
scatterplotMatrix(~pH+quality+residual.sugar+sulphates, reg.line=FALSE, 
  smooth=FALSE, spread=FALSE, span=0.5, ellipse=FALSE, levels=c(.5, .9), 
  id.n=0, diagonal = 'density', data=White_wines)

Scatterplot Matrix: quality, total sulfur dioxide, volatile acidity.

# Scatterplot matrix of quality, total.sulfur.dioxide and volatile.acidity.
# Regression lines, smoothers and ellipses are switched off, no points are
# labelled (id.n=0), and the diagonal panels are set to 'density'.
scatterplotMatrix(~quality+total.sulfur.dioxide+volatile.acidity, 
  reg.line=FALSE, smooth=FALSE, spread=FALSE, span=0.5, ellipse=FALSE, 
  levels=c(.5, .9), id.n=0, diagonal = 'density', data=White_wines)

Linear Correlation analysis shows:

# Correlation matrix for quality and the first group of predictors
# (alcohol, chlorides, citric.acid, density, fixed.acidity), using complete
# rows only. The `use` value is written out in full instead of relying on
# partial matching of "complete".
cor(
  White_wines[, c("alcohol", "chlorides", "citric.acid", "density",
                  "fixed.acidity", "quality")],
  use = "complete.obs"
)
##                   alcohol   chlorides  citric.acid    density
## alcohol        1.00000000 -0.36018871 -0.075728730 -0.7801376
## chlorides     -0.36018871  1.00000000  0.114364448  0.2572113
## citric.acid   -0.07572873  0.11436445  1.000000000  0.1495026
## density       -0.78013762  0.25721132  0.149502571  1.0000000
## fixed.acidity -0.12088112  0.02308564  0.289180698  0.2653310
## quality        0.43557472 -0.20993441 -0.009209091 -0.3071233
##               fixed.acidity      quality
## alcohol         -0.12088112  0.435574715
## chlorides        0.02308564 -0.209934411
## citric.acid      0.28918070 -0.009209091
## density          0.26533101 -0.307123313
## fixed.acidity    1.00000000 -0.113662831
## quality         -0.11366283  1.000000000
# Correlation matrix for quality and the second group of predictors
# (free.sulfur.dioxide, pH, residual.sugar, sulphates), using complete rows
# only. The `use` value is written out in full instead of relying on partial
# matching of "complete".
cor(
  White_wines[, c("free.sulfur.dioxide", "pH", "quality", "residual.sugar",
                  "sulphates")],
  use = "complete.obs"
)
##                     free.sulfur.dioxide            pH      quality
## free.sulfur.dioxide        1.0000000000 -0.0006177961  0.008158067
## pH                        -0.0006177961  1.0000000000  0.099427246
## quality                    0.0081580671  0.0994272457  1.000000000
## residual.sugar             0.2990983537 -0.1941334540 -0.097576829
## sulphates                  0.0592172458  0.1559514973  0.053677877
##                     residual.sugar   sulphates
## free.sulfur.dioxide     0.29909835  0.05921725
## pH                     -0.19413345  0.15595150
## quality                -0.09757683  0.05367788
## residual.sugar          1.00000000 -0.02666437
## sulphates              -0.02666437  1.00000000
# Correlation matrix for quality, total.sulfur.dioxide and volatile.acidity,
# using complete rows only. The `use` value is written out in full instead of
# relying on partial matching of "complete".
cor(
  White_wines[, c("quality", "total.sulfur.dioxide", "volatile.acidity")],
  use = "complete.obs"
)
##                         quality total.sulfur.dioxide volatile.acidity
## quality               1.0000000           -0.1747372       -0.1947230
## total.sulfur.dioxide -0.1747372            1.0000000        0.0892605
## volatile.acidity     -0.1947230            0.0892605        1.0000000

Correlation of each variable with quality (r):

| Variable | r |
|---|---|
| alcohol | 0.435574715 |
| chlorides | -0.209934411 |
| citric.acid | -0.009209091 |
| density | -0.307123313 |
| fixed.acidity | -0.113662831 |
| free.sulfur.dioxide | 0.008158067 |
| pH | 0.099427246 |
| residual.sugar | -0.097576829 |
| sulphates | 0.053677877 |
| total.sulfur.dioxide | -0.1747372 |
| volatile.acidity | -0.1947230 |

There seems to be a moderate positive relationship between alcohol and quality (r ≈ 0.44), the strongest of any variable. Density, chlorides, volatile acidity, and total sulfur dioxide have the strongest negative correlations with quality.

Linear Regressions

To further investigate potential relationships between quality and the variables linear regressions have been run below.

Regression Model Alcohol and quality.

# Simple linear regression with alcohol as the response and quality as the
# only predictor, followed by the fitted-model summary.
# NOTE(review): the report names quality as the dependent variable, so
# `quality ~ alcohol` may have been intended — confirm before interpreting
# the coefficients.
RegModel.Alcohol <- lm(alcohol~quality, data=White_wines)
summary(RegModel.Alcohol)
## 
## Call:
## lm(formula = alcohol ~ quality, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2986 -0.7882 -0.1382  0.8014  4.1223 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.95670    0.10626   65.47   <2e-16 ***
## quality      0.60524    0.01788   33.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.108 on 4896 degrees of freedom
## Multiple R-squared:  0.1897, Adjusted R-squared:  0.1896 
## F-statistic:  1146 on 1 and 4896 DF,  p-value: < 2.2e-16

Regression Model fixed.acidity and quality.

# Simple linear regression with fixed.acidity as the response and quality as
# the only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.fixed.acidity <- lm(fixed.acidity~quality, data=White_wines)
summary(RegModel.fixed.acidity)
## 
## Call:
## lm(formula = fixed.acidity ~ quality, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0416 -0.5499 -0.0499  0.4667  7.3584 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.49138    0.08042  93.152  < 2e-16 ***
## quality     -0.10830    0.01353  -8.005 1.48e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8385 on 4896 degrees of freedom
## Multiple R-squared:  0.01292,    Adjusted R-squared:  0.01272 
## F-statistic: 64.08 on 1 and 4896 DF,  p-value: 1.48e-15

Regression Model volatile.acidity and quality.

# Simple linear regression with volatile.acidity as the response and quality
# as the only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.volatile.acidity <- lm(volatile.acidity~quality, data=White_wines)
summary(RegModel.volatile.acidity)
## 
## Call:
## lm(formula = volatile.acidity ~ quality, data = White_wines)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.20986 -0.06554 -0.01554  0.04446  0.78014 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.408504   0.009483   43.08   <2e-16 ***
## quality     -0.022161   0.001595  -13.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09888 on 4896 degrees of freedom
## Multiple R-squared:  0.03792,    Adjusted R-squared:  0.03772 
## F-statistic:   193 on 1 and 4896 DF,  p-value: < 2.2e-16

Regression Model citric.acid and quality.

# Simple linear regression with citric.acid as the response and quality as
# the only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.citric.acid <- lm(citric.acid~quality, data=White_wines)
summary(RegModel.citric.acid)
## 
## Call:
## lm(formula = citric.acid ~ quality, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3366 -0.0653 -0.0153  0.0547  1.3260 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.341588   0.011608  29.427   <2e-16 ***
## quality     -0.001258   0.001953  -0.644    0.519    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.121 on 4896 degrees of freedom
## Multiple R-squared:  8.481e-05,  Adjusted R-squared:  -0.0001194 
## F-statistic: 0.4153 on 1 and 4896 DF,  p-value: 0.5193

Regression Model residual.sugar and quality.

# Simple linear regression with residual.sugar as the response and quality as
# the only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.residual.sugar <- lm(residual.sugar~quality, data=White_wines)
summary(RegModel.residual.sugar)
## 
## Call:
## lm(formula = residual.sugar ~ quality, data = White_wines)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.300 -4.482 -1.023  3.412 59.477 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.67613    0.48420   19.98  < 2e-16 ***
## quality     -0.55882    0.08146   -6.86 7.72e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.048 on 4896 degrees of freedom
## Multiple R-squared:  0.009521,   Adjusted R-squared:  0.009319 
## F-statistic: 47.06 on 1 and 4896 DF,  p-value: 7.724e-12

Regression Model chlorides and quality.

# Simple linear regression with chlorides as the response and quality as the
# only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.chlorides <- lm(chlorides~quality, data=White_wines)
summary(RegModel.chlorides)
## 
## Call:
## lm(formula = chlorides ~ quality, data = White_wines)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.042498 -0.009319 -0.003140  0.003860  0.295681 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0762135  0.0020490   37.20   <2e-16 ***
## quality     -0.0051789  0.0003447  -15.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02136 on 4896 degrees of freedom
## Multiple R-squared:  0.04407,    Adjusted R-squared:  0.04388 
## F-statistic: 225.7 on 1 and 4896 DF,  p-value: < 2.2e-16

Regression Model free.sulfur.dioxide and quality.

# Simple linear regression with free.sulfur.dioxide as the response and
# quality as the only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.free.sulfur.dioxide <- lm(free.sulfur.dioxide~quality, data=White_wines)
summary(RegModel.free.sulfur.dioxide)
## 
## Call:
## lm(formula = free.sulfur.dioxide ~ quality, data = White_wines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.171 -12.171  -1.484  10.516 254.143 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.3872     1.6313  21.080   <2e-16 ***
## quality       0.1567     0.2744   0.571    0.568    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.01 on 4896 degrees of freedom
## Multiple R-squared:  6.655e-05,  Adjusted R-squared:  -0.0001377 
## F-statistic: 0.3259 on 1 and 4896 DF,  p-value: 0.5681

Regression Model total.sulfur.dioxide and quality.

# Simple linear regression with total.sulfur.dioxide as the response and
# quality as the only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.total.sulfur.dioxide <- lm(total.sulfur.dioxide~quality, data=White_wines)
summary(RegModel.total.sulfur.dioxide)
## 
## Call:
## lm(formula = total.sulfur.dioxide ~ quality, data = White_wines)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -144.107  -28.722   -2.337   28.278  277.508 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 187.6464     4.0138   46.75   <2e-16 ***
## quality      -8.3849     0.6752  -12.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.85 on 4896 degrees of freedom
## Multiple R-squared:  0.03053,    Adjusted R-squared:  0.03034 
## F-statistic: 154.2 on 1 and 4896 DF,  p-value: < 2.2e-16

Regression Model density and quality.

# Simple linear regression with density as the response and quality as the
# only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.density <- lm(density~quality, data=White_wines)
summary(RegModel.density)
## 
## Call:
## lm(formula = density ~ quality, data = White_wines)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.007718 -0.002104 -0.000361  0.001859  0.045079 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.000e+00  2.730e-04 3663.07   <2e-16 ***
## quality     -1.037e-03  4.593e-05  -22.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002847 on 4896 degrees of freedom
## Multiple R-squared:  0.09432,    Adjusted R-squared:  0.09414 
## F-statistic: 509.9 on 1 and 4896 DF,  p-value: < 2.2e-16

Regression Model pH and quality.

# Simple linear regression with pH as the response and quality as the only
# predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.pH <- lm(pH~quality, data=White_wines)
summary(RegModel.pH)
## 
## Call:
## lm(formula = pH ~ quality, data = White_wines)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47034 -0.10034 -0.01034  0.08966  0.61966 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.088623   0.014413 214.301  < 2e-16 ***
## quality     0.016952   0.002425   6.992 3.08e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1503 on 4896 degrees of freedom
## Multiple R-squared:  0.009886,   Adjusted R-squared:  0.009684 
## F-statistic: 48.88 on 1 and 4896 DF,  p-value: 3.081e-12

Regression Model sulphates and quality.

# Simple linear regression with sulphates as the response and quality as the
# only predictor, followed by the fitted-model summary.
# NOTE(review): quality is the predictor here, not the response — confirm the
# intended direction.
RegModel.sulphates <- lm(sulphates~quality, data=White_wines)
summary(RegModel.sulphates)
## 
## Call:
## lm(formula = sulphates ~ quality, data = White_wines)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27761 -0.08069 -0.01377  0.05931  0.58239 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.449189   0.010931  41.092  < 2e-16 ***
## quality     0.006917   0.001839   3.761 0.000171 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.114 on 4896 degrees of freedom
## Multiple R-squared:  0.002881,   Adjusted R-squared:  0.002678 
## F-statistic: 14.15 on 1 and 4896 DF,  p-value: 0.000171

From these regressions we see that neither citric acid (p = 0.519) nor free sulfur dioxide (p = 0.568) appears to have a significant linear relationship with quality.

Multiple Regressions

Now we will begin building a model using multiple regressions. However, prior to building the model, we will first split our dataset into a training and testing set.

# Reproducible random split into training and test sets.
set.seed(20170214)  # the seed is the date
# Tag every row with a uniform random draw between 0 and 1.
White_wines$group <- runif(nrow(White_wines), min = 0, max = 1)

# Rows whose draw is at most 0.90 (about 90% of the data) form the training
# set; the remaining rows form the test set.
White_wines.train <- White_wines[White_wines$group <= 0.90, ]
White_wines.test <- White_wines[White_wines$group > 0.90, ]

# Sanity check: inspect the training partition.
summary(White_wines.train)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.100  
##  Mean   : 6.851   Mean   :0.2784   Mean   :0.3337   Mean   : 6.342  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.800  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  3.00      Min.   :  9.0       
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0       
##  Median :0.04300   Median : 34.00      Median :134.0       
##  Mean   :0.04574   Mean   : 35.28      Mean   :138.3       
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0       
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0       
##     density             pH         sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.72   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.09   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.18   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.19   Mean   :0.4892   Mean   :10.52  
##  3rd Qu.:0.9960   3rd Qu.:3.28   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.82   Max.   :1.0800   Max.   :14.20  
##     quality          group          
##  Min.   :3.000   Min.   :0.0002833  
##  1st Qu.:5.000   1st Qu.:0.2285282  
##  Median :6.000   Median :0.4596618  
##  Mean   :5.879   Mean   :0.4570277  
##  3rd Qu.:6.000   3rd Qu.:0.6859608  
##  Max.   :9.000   Max.   :0.8998507
# Sanity check: inspect the test partition.
summary(White_wines.test)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 5.000   Min.   :0.0800   Min.   :0.0000   Min.   : 0.800  
##  1st Qu.: 6.400   1st Qu.:0.2175   1st Qu.:0.2600   1st Qu.: 2.100  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 6.300  
##  Mean   : 6.889   Mean   :0.2766   Mean   :0.3387   Mean   : 6.866  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.:10.400  
##  Max.   :10.200   Max.   :1.0050   Max.   :0.8800   Max.   :22.000  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.01400   Min.   :  2.0       Min.   : 24.0       
##  1st Qu.:0.03675   1st Qu.: 23.0       1st Qu.:108.0       
##  Median :0.04300   Median : 35.0       Median :135.0       
##  Mean   :0.04612   Mean   : 35.6       Mean   :139.4       
##  3rd Qu.:0.05000   3rd Qu.: 47.0       3rd Qu.:170.2       
##  Max.   :0.20400   Max.   :124.0       Max.   :260.0       
##     density             pH          sulphates        alcohol     
##  Min.   :0.9877   Min.   :2.770   Min.   :0.280   Min.   : 8.40  
##  1st Qu.:0.9918   1st Qu.:3.080   1st Qu.:0.400   1st Qu.: 9.40  
##  Median :0.9941   Median :3.170   Median :0.480   Median :10.20  
##  Mean   :0.9942   Mean   :3.174   Mean   :0.496   Mean   :10.45  
##  3rd Qu.:0.9964   3rd Qu.:3.260   3rd Qu.:0.560   3rd Qu.:11.30  
##  Max.   :1.0010   Max.   :3.690   Max.   :1.010   Max.   :13.90  
##     quality          group       
##  Min.   :3.000   Min.   :0.9001  
##  1st Qu.:5.000   1st Qu.:0.9229  
##  Median :6.000   Median :0.9528  
##  Mean   :5.872   Mean   :0.9506  
##  3rd Qu.:6.000   3rd Qu.:0.9758  
##  Max.   :8.000   Max.   :0.9993

Now we will begin with a full model including all variables.

# Full multiple regression: quality modelled on all eleven physicochemical
# variables, fitted on the training set only, followed by its summary.
LinearModel.Full <- lm(quality ~ alcohol + chlorides + citric.acid + 
  density + fixed.acidity + free.sulfur.dioxide + pH + residual.sugar 
  + sulphates + total.sulfur.dioxide + volatile.acidity, 
  data=White_wines.train)
summary(LinearModel.Full)
## 
## Call:
## lm(formula = quality ~ alcohol + chlorides + citric.acid + density + 
##     fixed.acidity + free.sulfur.dioxide + pH + residual.sugar + 
##     sulphates + total.sulfur.dioxide + volatile.acidity, data = White_wines.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8642 -0.4973 -0.0362  0.4704  3.0782 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.552e+02  1.937e+01   8.013 1.42e-15 ***
## alcohol               1.885e-01  2.510e-02   7.510 7.10e-14 ***
## chlorides            -2.444e-01  5.701e-01  -0.429 0.668114    
## citric.acid           4.294e-02  1.010e-01   0.425 0.670887    
## density              -1.555e+02  1.965e+01  -7.917 3.06e-15 ***
## fixed.acidity         8.103e-02  2.176e-02   3.724 0.000199 ***
## free.sulfur.dioxide   4.064e-03  8.870e-04   4.581 4.74e-06 ***
## pH                    7.268e-01  1.099e-01   6.614 4.19e-11 ***
## residual.sugar        8.492e-02  7.816e-03  10.865  < 2e-16 ***
## sulphates             6.578e-01  1.068e-01   6.156 8.10e-10 ***
## total.sulfur.dioxide -4.434e-04  3.963e-04  -1.119 0.263311    
## volatile.acidity     -1.822e+00  1.199e-01 -15.199  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7538 on 4426 degrees of freedom
## Multiple R-squared:  0.2805, Adjusted R-squared:  0.2787 
## F-statistic: 156.9 on 11 and 4426 DF,  p-value: < 2.2e-16

The full model explains about 28% of the variability in quality. The F statistic is 156.9 and is highly significant. We will investigate what occurs as this model is reduced.

To continue, we will use a backward selection strategy and remove all variables that were not significant in the full model (chlorides, citric acid, and total sulfur dioxide).

Reduced Model 1 will include alcohol, density, fixed acidity, free sulfur dioxide, pH, residual sugar, sulphates, and volatile acidity.

# Reduced model: the full model's formula without chlorides, citric.acid and
# total.sulfur.dioxide, fitted on the training set, followed by its summary.
LinearModel.2 <- lm(quality ~ alcohol +  density + fixed.acidity + 
  free.sulfur.dioxide +  pH + residual.sugar + sulphates +  volatile.acidity, 
  data=White_wines.train)
summary(LinearModel.2)
## 
## Call:
## lm(formula = quality ~ alcohol + density + fixed.acidity + free.sulfur.dioxide + 
##     pH + residual.sugar + sulphates + volatile.acidity, data = White_wines.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8536 -0.4930 -0.0388  0.4675  3.0889 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.598e+02  1.872e+01   8.535  < 2e-16 ***
## alcohol              1.888e-01  2.495e-02   7.566 4.64e-14 ***
## density             -1.603e+02  1.898e+01  -8.445  < 2e-16 ***
## fixed.acidity        8.386e-02  2.133e-02   3.931 8.58e-05 ***
## free.sulfur.dioxide  3.487e-03  7.137e-04   4.885 1.07e-06 ***
## pH                   7.325e-01  1.078e-01   6.792 1.25e-11 ***
## residual.sugar       8.639e-02  7.594e-03  11.377  < 2e-16 ***
## sulphates            6.524e-01  1.064e-01   6.130 9.57e-10 ***
## volatile.acidity    -1.861e+00  1.152e-01 -16.150  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7537 on 4429 degrees of freedom
## Multiple R-squared:  0.2802, Adjusted R-squared:  0.2789 
## F-statistic: 215.5 on 8 and 4429 DF,  p-value: < 2.2e-16

This reduced model still explains about 28% of the variability in quality. The F statistic increased to 215.5 and is highly significant. We will investigate what occurs as this model is reduced further.

View influential variables

# Added-variable plots for the predictors in LinearModel.2
avPlots(LinearModel.2, id.n=2, id.cex=0.7)

# id.n - number of the most unusual observations to label in each plot, so that outlying wines can be picked out by row label
# id.cex - presumably scales the size of those point labels — confirm against the car documentation

Look at residuals

# Quantile-comparison (QQ) plot for LinearModel.2, labelling 3 observations (id.n=3)
qqPlot(LinearModel.2, id.n=3)

## 4746  254 2782 
##    1    2 4438
# In the qqPlot call, id.n is the number of observations with the largest absolute residuals to label
# Residual plots for LinearModel.2 (eight predictors); the per-predictor test statistics are printed below
residualPlots(LinearModel.2)

##                     Test stat Pr(>|t|)
## alcohol                 5.191    0.000
## density                 5.552    0.000
## fixed.acidity          -4.163    0.000
## free.sulfur.dioxide   -10.160    0.000
## pH                      0.880    0.379
## residual.sugar          2.520    0.012
## sulphates               0.729    0.466
## volatile.acidity        3.184    0.001
## Tukey test              2.551    0.011

Outlier

# Bonferroni outlier test on LinearModel.2; the table below lists the flagged
# rows with their studentized residuals and unadjusted and Bonferroni p-values
outlierTest(LinearModel.2)
##       rstudent unadjusted p-value Bonferonni p
## 4746 -5.285819         1.3116e-07   0.00058211
## 2782  4.931011         8.4800e-07   0.00376340
## 254  -4.496908         7.0712e-06   0.03138200
## 446  -4.485892         7.4449e-06   0.03304100

Influence Plot

# Influence plot for LinearModel.2; the table below reports studentized
# residual, hat value and Cook's distance for the labelled observations
influencePlot(LinearModel.2, id.n=3)

##         StudRes         Hat       CookD
## 254  -4.4969082 0.002555952 0.005732828
## 1527 -0.6554449 0.038237083 0.001898028
## 1932 -3.7826585 0.015025299 0.024179469
## 2782  4.9310113 0.351726593 1.458130190
## 4746 -5.2858195 0.058668586 0.192314295

Heteroskedasticity

# Test LinearModel.2 for heteroskedasticity
ncvTest(LinearModel.2) # score test for non-constant error variance
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 16.04371    Df = 1     p = 6.189685e-05
# Variance inflation factors for each predictor in LinearModel.2
vif(LinearModel.2)
##             alcohol             density       fixed.acidity 
##            7.337053           25.278419            2.545179 
## free.sulfur.dioxide                  pH      residual.sugar 
##            1.151922            2.083100           11.620123 
##           sulphates    volatile.acidity 
##            1.126483            1.060061
#if higher than 4 we want to take variable out b/c it is not independent and highly correlates with something in there 

Based on the previous plots and analysis, we further reduce the model. Density has by far the largest variance inflation factor (25.3), so it is removed; residual sugar and free sulfur dioxide are kept in this model.

# Reduced model 3: same predictors as LinearModel.2 except density, fitted on
# the training set, followed by its summary.
LinearModel.3 <- lm(quality ~ alcohol + fixed.acidity + 
  free.sulfur.dioxide +  pH + residual.sugar + sulphates +  volatile.acidity, 
  data=White_wines.train)
summary(LinearModel.3)
## 
## Call:
## lm(formula = quality ~ alcohol + fixed.acidity + free.sulfur.dioxide + 
##     pH + residual.sugar + sulphates + volatile.acidity, data = White_wines.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8938 -0.4962 -0.0333  0.4624  3.1774 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.7101754  0.3536637   4.836 1.37e-06 ***
## alcohol              0.3800365  0.0105722  35.947  < 2e-16 ***
## fixed.acidity       -0.0451858  0.0150020  -3.012 0.002610 ** 
## free.sulfur.dioxide  0.0037010  0.0007189   5.148 2.74e-07 ***
## pH                   0.1719520  0.0856708   2.007 0.044797 *  
## residual.sugar       0.0261723  0.0026316   9.946  < 2e-16 ***
## sulphates            0.3989868  0.1029242   3.877 0.000107 ***
## volatile.acidity    -2.0134448  0.1146832 -17.557  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7597 on 4430 degrees of freedom
## Multiple R-squared:  0.2686, Adjusted R-squared:  0.2675 
## F-statistic: 232.5 on 7 and 4430 DF,  p-value: < 2.2e-16

This reduced model still explains about 27% of the variability in quality. The F statistic increased to 232.5 and is still highly significant.

# Residual plots for LinearModel.3 (seven predictors); the per-predictor test statistics are printed below
residualPlots(LinearModel.3)

##                     Test stat Pr(>|t|)
## alcohol                 5.243    0.000
## fixed.acidity          -3.584    0.000
## free.sulfur.dioxide   -10.370    0.000
## pH                      0.386    0.700
## residual.sugar         -2.049    0.041
## sulphates               0.878    0.380
## volatile.acidity        1.968    0.049
## Tukey test              0.145    0.884

We will investigate what occurs as this model is further reduced by removing free sulfur dioxide.

# Reduced model 4: same predictors as LinearModel.3 except
# free.sulfur.dioxide, fitted on the training set, followed by its summary.
LinearModel.4 <- lm(quality ~ alcohol + fixed.acidity +  pH + residual.sugar + sulphates + volatile.acidity, 
  data=White_wines.train)
summary(LinearModel.4)
## 
## Call:
## lm(formula = quality ~ alcohol + fixed.acidity + pH + residual.sugar + 
##     sulphates + volatile.acidity, data = White_wines.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4043 -0.4962 -0.0369  0.4662  3.1503 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.914453   0.352441   5.432 5.87e-08 ***
## alcohol           0.373244   0.010520  35.481  < 2e-16 ***
## fixed.acidity    -0.051461   0.014995  -3.432 0.000605 ***
## pH                0.178963   0.085906   2.083 0.037286 *  
## residual.sugar    0.029424   0.002562  11.485  < 2e-16 ***
## sulphates         0.432488   0.103013   4.198 2.74e-05 ***
## volatile.acidity -2.080385   0.114271 -18.206  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7618 on 4431 degrees of freedom
## Multiple R-squared:  0.2643, Adjusted R-squared:  0.2633 
## F-statistic: 265.3 on 6 and 4431 DF,  p-value: < 2.2e-16

This reduced model still explains about 26% of the variability in quality. The F statistic increased to 265.3 and is still highly significant.

# Residual plots for LinearModel.4 (six predictors); the per-predictor test statistics are printed below
residualPlots(LinearModel.4)

##                  Test stat Pr(>|t|)
## alcohol              5.496    0.000
## fixed.acidity       -3.795    0.000
## pH                   0.165    0.869
## residual.sugar      -2.603    0.009
## sulphates            1.054    0.292
## volatile.acidity     1.760    0.078
## Tukey test          -0.203    0.839
# Variance inflation factors for each predictor in LinearModel.4
vif(LinearModel.4)
##          alcohol    fixed.acidity               pH   residual.sugar 
##         1.276125         1.230980         1.293699         1.294518 
##        sulphates volatile.acidity 
##         1.032790         1.020648
#if higher than 4 we want to take variable out b/c it is not independent and highly correlates with something in there 

Next we remove pH, the weakest predictor in LinearModel.4 (p = 0.037); residual sugar is retained for now.

# LinearModel.5: the LinearModel.4 specification without pH, leaving five
# predictors of quality, fitted on White_wines.train.
LinearModel.5 <- lm(
  formula = quality ~ alcohol + fixed.acidity + residual.sugar +
    sulphates + volatile.acidity,
  data = White_wines.train
)
summary(LinearModel.5)
## 
## Call:
## lm(formula = quality ~ alcohol + fixed.acidity + residual.sugar + 
##     sulphates + volatile.acidity, data = White_wines.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3580 -0.4939 -0.0352  0.4642  3.1857 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.560289   0.167709   15.27  < 2e-16 ***
## alcohol           0.373620   0.010522   35.51  < 2e-16 ***
## fixed.acidity    -0.064489   0.013634   -4.73 2.31e-06 ***
## residual.sugar    0.028655   0.002536   11.30  < 2e-16 ***
## sulphates         0.468359   0.101603    4.61 4.15e-06 ***
## volatile.acidity -2.088819   0.114243  -18.28  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7621 on 4432 degrees of freedom
## Multiple R-squared:  0.2635, Adjusted R-squared:  0.2627 
## F-statistic: 317.2 on 5 and 4432 DF,  p-value: < 2.2e-16

This reduced model still explains about 26% of the variability in quality. The F statistic increased to 317.2 and is still highly significant. The adjusted R-squared is essentially unchanged (0.2627 versus 0.2633 for LinearModel.4), so dropping pH costs almost no explanatory power, but it does not indicate a better fit than the previous model.

# Diagnostics for LinearModel.5 (five predictors, pH removed): residual plots,
# plus a printed table with one test statistic per predictor and a Tukey test.
residualPlots(LinearModel.5)

##                  Test stat Pr(>|t|)
## alcohol              5.143    0.000
## fixed.acidity       -3.568    0.000
## residual.sugar      -2.491    0.013
## sulphates            0.929    0.353
## volatile.acidity     1.930    0.054
## Tukey test          -0.471    0.637
# Variance inflation factor of each predictor in LinearModel.5.
vif(LinearModel.5)
##          alcohol    fixed.acidity   residual.sugar        sulphates 
##         1.275751         1.016852         1.267666         1.003936 
## volatile.acidity 
##         1.019367
# Rule of thumb used here: a VIF above 4 means the predictor is highly correlated
# with the other predictors (not independent of them) and is a candidate for removal.

Based on the residuals I would like to see what happens when residual sugar and fixed acidity are removed from the model.

# Three-predictor refit: residual.sugar and fixed.acidity are dropped as well.
# NOTE: this reuses the name LinearModel.5, replacing the five-predictor model
# fitted earlier; every later reference to LinearModel.5 sees this fit.
LinearModel.5 <- lm(
  formula = quality ~ alcohol + sulphates + volatile.acidity,
  data = White_wines.train
)
summary(LinearModel.5)
## 
## Call:
## lm(formula = quality ~ alcohol + sulphates + volatile.acidity, 
##     data = White_wines.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3158 -0.4886 -0.0468  0.4947  3.1571 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.785383   0.116676  23.873  < 2e-16 ***
## alcohol           0.325036   0.009493  34.239  < 2e-16 ***
## sulphates         0.434956   0.103148   4.217 2.53e-05 ***
## volatile.acidity -1.936946   0.115352 -16.792  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7744 on 4434 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2388 
## F-statistic:   465 on 3 and 4434 DF,  p-value: < 2.2e-16

This model does not appear to fit better than the previous model. It accounts for only 24% of the variability and, while still significant, its adjusted R-squared has decreased from 0.26 to 0.24.

# Coefficient estimates and standard errors of the candidate models, side by side.
# LinearModel.5 is the three-predictor fit at this point in the script.
compareCoefs(
  LinearModel.2,
  LinearModel.3,
  LinearModel.4,
  LinearModel.5,
  LinearModel.Full
)
## 
## Call:
## 1: lm(formula = quality ~ alcohol + density + fixed.acidity + 
##   free.sulfur.dioxide + pH + residual.sugar + sulphates + 
##   volatile.acidity, data = White_wines.train)
## 2: lm(formula = quality ~ alcohol + fixed.acidity + 
##   free.sulfur.dioxide + pH + residual.sugar + sulphates + 
##   volatile.acidity, data = White_wines.train)
## 3: lm(formula = quality ~ alcohol + fixed.acidity + pH + 
##   residual.sugar + sulphates + volatile.acidity, data = 
##   White_wines.train)
## 4: lm(formula = quality ~ alcohol + sulphates + volatile.acidity, data 
##   = White_wines.train)
## 5: lm(formula = quality ~ alcohol + chlorides + citric.acid + density 
##   + fixed.acidity + free.sulfur.dioxide + pH + residual.sugar + 
##   sulphates + total.sulfur.dioxide + volatile.acidity, data = 
##   White_wines.train)
##                         Est. 1      SE 1    Est. 2      SE 2    Est. 3
## (Intercept)           1.60e+02  1.87e+01  1.71e+00  3.54e-01  1.91e+00
## alcohol               1.89e-01  2.50e-02  3.80e-01  1.06e-02  3.73e-01
## density              -1.60e+02  1.90e+01                              
## fixed.acidity         8.39e-02  2.13e-02 -4.52e-02  1.50e-02 -5.15e-02
## free.sulfur.dioxide   3.49e-03  7.14e-04  3.70e-03  7.19e-04          
## pH                    7.32e-01  1.08e-01  1.72e-01  8.57e-02  1.79e-01
## residual.sugar        8.64e-02  7.59e-03  2.62e-02  2.63e-03  2.94e-02
## sulphates             6.52e-01  1.06e-01  3.99e-01  1.03e-01  4.32e-01
## volatile.acidity     -1.86e+00  1.15e-01 -2.01e+00  1.15e-01 -2.08e+00
## chlorides                                                             
## citric.acid                                                           
## total.sulfur.dioxide                                                  
##                           SE 3    Est. 4      SE 4    Est. 5      SE 5
## (Intercept)           3.52e-01  2.79e+00  1.17e-01  1.55e+02  1.94e+01
## alcohol               1.05e-02  3.25e-01  9.49e-03  1.88e-01  2.51e-02
## density                                            -1.56e+02  1.96e+01
## fixed.acidity         1.50e-02                      8.10e-02  2.18e-02
## free.sulfur.dioxide                                 4.06e-03  8.87e-04
## pH                    8.59e-02                      7.27e-01  1.10e-01
## residual.sugar        2.56e-03                      8.49e-02  7.82e-03
## sulphates             1.03e-01  4.35e-01  1.03e-01  6.58e-01  1.07e-01
## volatile.acidity      1.14e-01 -1.94e+00  1.15e-01 -1.82e+00  1.20e-01
## chlorides                                          -2.44e-01  5.70e-01
## citric.acid                                         4.29e-02  1.01e-01
## total.sulfur.dioxide                               -4.43e-04  3.96e-04
# Compare the regression output of three models (LinearModel.3, LinearModel.4
# and the current LinearModel.5) in a single text table.
stargazer(
  LinearModel.3, LinearModel.4, LinearModel.5,
  title = "Comparison of Regression outputs",
  type = "text",
  align = TRUE
)

Comparison of Regression outputs

==================================================================================================
                                                 Dependent variable:
                    ------------------------------------------------------------------------------
                                                       quality
                               (1)                       (2)                       (3)
--------------------------------------------------------------------------------------------------
alcohol                      0.380***                  0.373***                  0.325***
                             (0.011)                   (0.011)                   (0.009)

fixed.acidity               -0.045***                 -0.051***
                             (0.015)                   (0.015)

free.sulfur.dioxide          0.004***
                             (0.001)

pH                           0.172**                   0.179**
                             (0.086)                   (0.086)

residual.sugar               0.026***                  0.029***
                             (0.003)                   (0.003)

sulphates                    0.399***                  0.432***                  0.435***
                             (0.103)                   (0.103)                   (0.103)

volatile.acidity            -2.013***                 -2.080***                 -1.937***
                             (0.115)                   (0.114)                   (0.115)

Constant                     1.710***                  1.914***                  2.785***
                             (0.354)                   (0.352)                   (0.117)

--------------------------------------------------------------------------------------------------
Observations                  4,438                     4,438                     4,438
R2                            0.269                     0.264                     0.239
Adjusted R2                   0.267                     0.263                     0.239
Residual Std. Error     0.760 (df = 4430)         0.762 (df = 4431)         0.774 (df = 4434)
F Statistic         232.461*** (df = 7; 4430) 265.260*** (df = 6; 4431) 465.047*** (df = 3; 4434)
==================================================================================================
Note:                                                                  *p<0.1; **p<0.05; ***p<0.01

# stargazer note: with type = "html" the table can only be seen when knitting to
# HTML; change type to "text" to see the table right away. Options for type are
# "html", "text" or "latex".
# Test for heteroskedasticity (non-constant error variance).
# The recorded p-value (2.27e-07) is very small, so the hypothesis of constant
# variance is rejected: the residuals of LinearModel.4 are heteroskedastic.
ncvTest(LinearModel.4) # score test for non-constant variance; a small p means heteroskedasticity
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 26.78513    Df = 1     p = 2.273799e-07
# Variance inflation factors for LinearModel.4 (same call and values as the
# earlier diagnostics for this model).
vif(LinearModel.4)
##          alcohol    fixed.acidity               pH   residual.sugar 
##         1.276125         1.230980         1.293699         1.294518 
##        sulphates volatile.acidity 
##         1.032790         1.020648
# Rule of thumb used here: a VIF above 4 means the predictor is highly correlated
# with the other predictors (not independent of them) and is a candidate for removal.
# Influence plot for LinearModel.4; the printed table lists the studentized
# residual, hat value and Cook's distance of the flagged observations.
# NOTE(review): id.n looks like the older car argument style; newer car releases
# appear to expect id = list(n = 3) -- confirm against the installed version.
influencePlot(LinearModel.4, id.n=3)

##         StudRes         Hat        CookD
## 254  -4.4819619 0.001694585 0.0048503308
## 446  -4.2795152 0.001112097 0.0029015056
## 741  -4.3948589 0.001592595 0.0043832688
## 1418 -3.5659407 0.004188520 0.0076205600
## 1527  0.5376494 0.020892764 0.0008813241
## 2051 -3.3491187 0.008253723 0.0133049158
## 2782 -0.9626167 0.053125121 0.0074271681
## 4040 -1.1545176 0.015677406 0.0030325443
## 4481 -3.0340351 0.006500608 0.0085886727

Based on these results, I believe LinearModel.4 is the best model for this data. I am not pleased with the residual plots or the influential points, and I would also like to include fewer variables in the model. However, I am not sure how much explained variability to trade away for a simpler model and better diagnostics. Currently the model accounts for 26% of the variability in the quality score.

We will now run the model on the testing dataset to see if it is a good predictor of the values.

# Predicted quality for every row of White_wines.test, using the coefficients
# of LinearModel.4.
predict(object = LinearModel.4, newdata = White_wines.test)
##        5       22       30       49       73       94       96       97 
## 5.754555 5.849885 6.389817 5.543937 5.885102 6.637284 5.532576 5.428932 
##      108      128      144      146      153      156      160      161 
## 5.773299 5.775722 5.585470 5.866041 5.367857 5.467006 6.032974 6.377618 
##      170      178      192      202      208      216      226      233 
## 5.197689 5.776077 5.617164 5.320064 5.107399 5.610988 5.583584 5.519034 
##      259      261      269      286      295      318      322      323 
## 6.374263 6.061912 5.098319 5.123943 4.180785 5.952060 5.952060 5.877922 
##      338      351      355      360      379      388      390      405 
## 5.656004 6.562630 5.324585 5.982613 5.617402 5.566075 5.672618 6.860365 
##      422      428      429      442      447      451      465      466 
## 6.070248 5.608655 5.254582 5.774928 6.551588 4.759324 5.085239 5.496366 
##      498      503      534      550      562      572      575      584 
## 6.620995 5.563136 5.507412 4.887359 5.445372 6.178006 6.298038 5.637185 
##      586      591      616      624      631      634      648      657 
## 5.637185 5.443775 5.357665 6.130614 5.649293 5.696528 5.758915 5.968483 
##      661      663      676      702      706      708      714      726 
## 5.259055 4.858217 5.361469 7.109821 5.822284 6.307564 5.673433 5.378833 
##      728      748      781      837      844      854      855      857 
## 5.651501 6.138997 5.772917 6.276716 6.053175 5.106925 5.765050 5.398917 
##      867      876      878      879      883      891      895      902 
## 5.765050 5.728577 5.612588 5.363674 6.259618 5.758706 6.001125 5.649969 
##      905      912      917      922      945      980      988      992 
## 5.221398 5.604962 6.089411 5.236697 5.465398 5.656118 5.659932 5.627005 
##     1000     1002     1011     1012     1028     1055     1076     1090 
## 5.697409 5.463115 5.957041 6.280983 4.828393 5.576456 6.397113 6.014210 
##     1091     1106     1109     1137     1146     1147     1156     1167 
## 5.784013 6.242261 6.545170 6.590166 5.180613 5.592791 5.637328 4.950176 
##     1173     1182     1213     1214     1223     1233     1235     1241 
## 6.761323 5.406320 5.925312 5.821558 5.563886 6.266618 5.613622 5.506863 
##     1261     1266     1299     1300     1312     1315     1316     1341 
## 5.444045 6.162557 6.406539 6.169369 6.010110 5.356462 5.853127 5.694987 
##     1350     1366     1370     1394     1395     1397     1404     1405 
## 5.271930 5.842960 4.820627 5.720809 6.854030 6.197566 6.015806 6.013556 
##     1407     1433     1442     1445     1454     1456     1473     1481 
## 5.856961 6.199309 5.576169 5.931977 5.984055 5.502641 6.055716 6.032355 
##     1518     1523     1541     1586     1624     1632     1633     1659 
## 5.906402 6.123471 5.887483 5.272583 5.702082 6.193954 6.572516 5.193990 
##     1670     1677     1704     1710     1717     1757     1758     1762 
## 5.386829 6.062452 5.410414 5.949095 6.124591 5.476755 5.251264 5.639551 
##     1765     1768     1790     1804     1811     1815     1871     1880 
## 5.476755 5.125231 6.531564 5.805759 5.311551 6.192163 5.900968 5.741693 
##     1882     1883     1891     1926     1928     1936     1938     1942 
## 5.741693 5.590267 5.590267 5.699303 5.718286 5.570971 6.293422 5.228709 
##     1952     1977     1985     1988     1993     2014     2020     2029 
## 3.914879 5.871837 5.607854 5.795175 5.889762 6.148233 6.148233 5.598355 
##     2044     2073     2088     2091     2107     2121     2129     2133 
## 6.105045 5.720818 5.495504 5.448895 5.610094 5.428639 4.911079 5.939254 
##     2141     2142     2149     2163     2183     2190     2200     2201 
## 5.770479 5.770479 5.399303 4.650740 5.745360 5.798194 5.737759 5.507977 
##     2210     2216     2271     2290     2309     2312     2329     2383 
## 6.330326 5.728844 6.020957 6.417134 5.134960 6.432918 5.485319 6.037306 
##     2385     2386     2392     2396     2404     2417     2423     2434 
## 6.037306 5.457223 5.954604 5.840390 6.530634 6.048425 5.288937 5.309192 
##     2456     2477     2481     2498     2525     2552     2558     2560 
## 5.536748 5.124726 5.578971 5.737561 5.512502 5.605980 6.366352 5.388761 
##     2563     2584     2586     2589     2621     2638     2643     2650 
## 5.833609 5.687656 5.687656 5.687656 5.571208 5.401601 5.731640 6.036431 
##     2657     2668     2674     2682     2691     2703     2715     2746 
## 5.870158 6.313997 5.990224 5.739648 5.288876 5.626365 5.581857 6.370984 
##     2753     2766     2776     2784     2785     2788     2793     2795 
## 6.319638 6.250635 6.163672 6.080017 6.080017 5.860250 5.566642 5.364473 
##     2799     2801     2809     2822     2837     2887     2891     2924 
## 5.982030 5.512134 5.469431 5.506068 5.539831 5.425472 6.551646 6.081222 
##     2932     2943     2945     2957     2959     2960     2961     2962 
## 6.766951 5.509591 5.916456 6.492353 5.739704 6.535618 6.613038 6.231767 
##     3015     3043     3052     3058     3062     3066     3084     3105 
## 6.020093 5.748245 5.386282 6.804767 6.036641 6.100894 7.023438 6.074370 
##     3108     3115     3117     3127     3128     3136     3138     3139 
## 5.823993 6.343635 6.109906 6.249132 5.915144 5.239467 6.007609 6.087446 
##     3161     3166     3169     3180     3181     3193     3209     3233 
## 6.514180 5.480466 6.717271 5.700652 6.436530 6.512904 6.005123 6.725933 
##     3238     3252     3267     3292     3294     3298     3300     3308 
## 6.089750 5.878405 6.449780 6.982516 6.510050 5.479262 6.429623 5.970557 
##     3323     3330     3335     3344     3349     3368     3369     3381 
## 6.646428 6.036378 5.737937 5.551329 6.233818 6.707571 5.812229 6.414225 
##     3399     3407     3413     3426     3427     3433     3437     3459 
## 5.577143 6.114846 5.891664 5.955132 5.955132 6.025953 6.446375 7.153511 
##     3468     3487     3502     3509     3511     3516     3525     3526 
## 5.401801 6.097220 6.208526 5.592099 6.114772 6.617752 6.183876 6.183876 
##     3528     3541     3542     3549     3567     3569     3570     3571 
## 5.650672 6.576099 5.971824 6.223617 6.273472 6.311837 6.139212 6.161398 
##     3591     3600     3606     3611     3612     3616     3628     3632 
## 6.565801 5.394465 5.356274 5.356274 6.313549 6.430932 5.291766 5.929097 
##     3633     3644     3648     3667     3669     3675     3696     3702 
## 5.721435 6.219669 6.583262 6.173623 6.227363 5.125455 6.278390 5.765670 
##     3707     3710     3712     3721     3729     3733     3736     3748 
## 6.405111 5.484367 5.586235 6.257105 7.155641 5.610276 6.910907 5.476774 
##     3771     3775     3789     3797     3801     3820     3821     3824 
## 6.344864 5.515612 5.573958 5.957831 5.815028 6.194149 5.769951 5.591405 
##     3828     3840     3844     3850     3861     3864     3871     3872 
## 6.240179 4.955945 6.654612 5.864755 5.382777 5.694787 5.956016 5.694787 
##     3873     3931     3948     3954     3977     3981     4023     4041 
## 5.150634 6.298004 6.564024 5.887486 6.734503 6.240065 5.193307 5.437248 
##     4045     4055     4060     4066     4074     4078     4116     4134 
## 5.609037 5.777976 6.278882 6.604508 4.781410 6.096600 6.256166 6.200459 
##     4138     4151     4166     4168     4180     4193     4251     4253 
## 5.277407 5.123081 6.412160 6.920872 5.892801 5.853021 6.158141 6.158141 
##     4270     4276     4280     4289     4295     4314     4328     4345 
## 5.289500 5.731301 6.251883 5.944465 6.164465 6.238063 5.647516 5.192819 
##     4351     4355     4359     4360     4387     4398     4400     4402 
## 6.659673 6.080761 5.657107 5.422664 6.108683 5.793002 5.790480 6.630611 
##     4403     4420     4422     4425     4426     4430     4437     4451 
## 6.158496 5.660858 5.660858 5.660858 5.820353 5.857527 6.413331 6.023727 
##     4456     4476     4480     4488     4503     4516     4523     4528 
## 5.728444 6.681839 4.758790 6.367368 6.134158 6.032092 5.513647 5.719944 
##     4533     4552     4562     4568     4583     4629     4640     4646 
## 6.396449 6.031559 6.428608 6.225140 5.916541 6.135481 5.545675 6.812276 
##     4657     4666     4667     4675     4676     4678     4684     4690 
## 6.221562 6.793892 5.353244 6.174447 6.129973 6.417458 6.184083 5.634909 
##     4697     4701     4715     4716     4736     4745     4750     4764 
## 7.162050 5.771801 6.273579 5.945358 6.142887 5.940731 5.708126 6.153141 
##     4766     4771     4783     4817     4824     4830     4832     4834 
## 6.763756 5.750237 5.396718 5.781465 6.383357 6.488271 5.249450 5.918841 
##     4837     4858     4867     4885 
## 5.219486 5.996485 5.831067 5.498981
# Fit the LinearModel.4 specification again, this time on White_wines.test.
# NOTE(review): this estimates new coefficients from the test data; it does not
# score the LinearModel.4 predictions against the observed test responses.
LinearModel.test <- lm(
  formula = quality ~ alcohol + fixed.acidity + pH + residual.sugar +
    sulphates + volatile.acidity,
  data = White_wines.test
)
summary(LinearModel.test)
## 
## Call:
## lm(formula = quality ~ alcohol + fixed.acidity + pH + residual.sugar + 
##     sulphates + volatile.acidity, data = White_wines.test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.72234 -0.48958 -0.04004  0.47059  2.60770 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.095325   1.116756   3.667 0.000274 ***
## alcohol           0.339922   0.031396  10.827  < 2e-16 ***
## fixed.acidity    -0.171231   0.046668  -3.669 0.000272 ***
## pH               -0.068681   0.265655  -0.259 0.796111    
## residual.sugar    0.014961   0.007941   1.884 0.060183 .  
## sulphates         0.257427   0.271472   0.948 0.343502    
## volatile.acidity -2.201534   0.347204  -6.341 5.54e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7241 on 453 degrees of freedom
## Multiple R-squared:  0.3124, Adjusted R-squared:  0.3033 
## F-statistic:  34.3 on 6 and 453 DF,  p-value: < 2.2e-16

Refitting this model on the test data gives a significant F statistic, with the model explaining 30% of the variability in the quality score (adjusted R-squared = 0.30). This model uses alcohol, fixed acidity, pH, residual sugar, sulphates, and volatile acidity to explain quality. The equation for this model is:

Y = 4.10 + (0.34)x1 + (-0.17)x2 + (-0.07)x3 + (0.01)x4 + (0.26)x5 + (-2.20)x6 + E

Where: Y = quality; x1 = alcohol; x2 = fixed acidity; x3 = pH; x4 = residual sugar; x5 = sulphates; x6 = volatile acidity; E = error.

Using this model, it appears volatile acidity influences quality the most (it has the largest coefficient in absolute value). Holding the other variables constant, a one-unit increase in volatile acidity is associated with a 2.20-point decrease in the quality score of the wine.